Sanorama Hie et al., 2019
GitHub
Tutorial external API
External external API tutorial
A fix to run scran pooling normalization computeSumFactors in current python environment.
import scanpy as sc
import scanorama
import numpy as np
import pandas as pd
import os
# Working directory
os.chdir('/research/peer/fdeckert/FD20200109SPLENO')
# rpy2
os.environ['R_HOME'] = '/home/fdeckert/bin/miniconda3/envs/p.3.8.12-FD20200109SPLENO/lib/R'
# Communication
import rpy2.rinterface_lib.callbacks
import logging
rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR) # Ignore R warnings
# Convertion
from rpy2.robjects import pandas2ri
import anndata2ri
# Activate conversion globaly
pandas2ri.activate()
anndata2ri.activate()
%load_ext rpy2.ipython
%%R
library(scran)
# Plotting
import rpy2.robjects as robjects
color_load = robjects.r.source('plotting_global.R')
color = dict()
for i in range(len(color_load[0])):
color[color_load[0].names[i]] = {key : color_load[0][i].rx2(key)[0] for key in color_load[0][i].names}
sc.set_figure_params(figsize=(5, 5))
adata = sc.read_h5ad('data/object/qc.h5ad')
adata = adata[adata.obs['doublet_class']=='Singlet'].copy()
def set_color(categories):
categories = [x for x in categories if x in list(adata.obs.columns)]
for category in categories:
adata.obs[category] = pd.Series(adata.obs[category], dtype='category')
keys = list(color[category].keys())
keys = [x for x in keys if x in list(adata.obs[category])]
adata.obs[category] = adata.obs[category].cat.reorder_categories(keys)
adata.uns[category+'_colors'] = np.array([color[category].get(key) for key in keys], dtype=object)
# Set colors
set_color(list(color.keys()))
adata_sub = dict()
for sample_group in adata.obs['sample_group'].unique():
adata_sub[sample_group] = adata[adata.obs['sample_group']==sample_group].copy()
adata_sub = list(adata_sub.values())
def scran_input(adata):
adata_pp = adata.copy()
# For computeSumFactors we need a good coverage for each gene
sc.pp.filter_genes(adata_pp, min_cells=20)
# Compute groups
sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=1e6)
sc.pp.log1p(adata_pp)
sc.pp.pca(adata_pp, n_comps=15, svd_solver='arpack')
sc.pp.neighbors(adata_pp)
sc.tl.louvain(adata_pp, key_added='groups', resolution=1)
input_groups = adata_pp.obs['groups']
# Set count matrix
data_mat = adata.X.T
data = {'input_groups': input_groups, 'data_mat': data_mat}
return(data)
scran_input = [scran_input(x) for x in adata_sub]
data_mat = scran_input[0]['data_mat']
input_groups = list(scran_input[0]['input_groups'])
%%R -i data_mat -i input_groups -o size_factors
size_factors = sizeFactors(computeSumFactors(SingleCellExperiment(list(counts=data_mat)), clusters=input_groups, min.mean=0.1))
adata_sub[0].obs['size_factors'] = size_factors
data_mat = scran_input[1]['data_mat']
input_groups = list(scran_input[1]['input_groups'])
%%R -i data_mat -i input_groups -o size_factors
size_factors = sizeFactors(computeSumFactors(SingleCellExperiment(list(counts=data_mat)), clusters=input_groups, min.mean=0.1))
adata_sub[1].obs['size_factors'] = size_factors
data_mat = scran_input[2]['data_mat']
input_groups = list(scran_input[2]['input_groups'])
%%R -i data_mat -i input_groups -o size_factors
size_factors = sizeFactors(computeSumFactors(SingleCellExperiment(list(counts=data_mat)), clusters=input_groups, min.mean=0.1))
adata_sub[2].obs['size_factors'] = size_factors
data_mat = scran_input[3]['data_mat']
input_groups = list(scran_input[3]['input_groups'])
%%R -i data_mat -i input_groups -o size_factors
size_factors = sizeFactors(computeSumFactors(SingleCellExperiment(list(counts=data_mat)), clusters=input_groups, min.mean=0.1))
adata_sub[3].obs['size_factors'] = size_factors
adata_sub[0].X /= adata_sub[0].obs['size_factors'].values[:,None]
sc.pp.log1p(adata_sub[0])
adata_sub[1].X /= adata_sub[1].obs['size_factors'].values[:,None]
sc.pp.log1p(adata_sub[1])
adata_sub[2].X /= adata_sub[2].obs['size_factors'].values[:,None]
sc.pp.log1p(adata_sub[2])
adata_sub[3].X /= adata_sub[3].obs['size_factors'].values[:,None]
sc.pp.log1p(adata_sub[3])
sc.pp.highly_variable_genes(adata_sub[0], flavor='cell_ranger', n_top_genes=5000)
sc.pp.highly_variable_genes(adata_sub[1], flavor='cell_ranger', n_top_genes=5000)
sc.pp.highly_variable_genes(adata_sub[2], flavor='cell_ranger', n_top_genes=5000)
sc.pp.highly_variable_genes(adata_sub[3], flavor='cell_ranger', n_top_genes=5000)
hvg_0 = adata_sub[0].var_names[adata_sub[0].var.highly_variable]
hvg_1 = adata_sub[1].var_names[adata_sub[1].var.highly_variable]
hvg_2 = adata_sub[2].var_names[adata_sub[2].var.highly_variable]
hvg_3 = adata_sub[3].var_names[adata_sub[3].var.highly_variable]
sc.pp.scale(adata_sub[0])
sc.pp.scale(adata_sub[1])
sc.pp.scale(adata_sub[2])
sc.pp.scale(adata_sub[3])
adata_set = adata_sub.copy()
adata_sub = adata_set.copy()
# Run Scanorama
scanorama.integrate_scanpy(adata_sub, dimred=150, knn=20, verbose=True)
# Concatenate scanorama output
X_scanorama = [ad.obsm['X_scanorama'] for ad in adata_sub]
X_scanorama = np.concatenate(X_scanorama)
obs_names = [ad.obs_names for ad in adata_sub]
obs_names = np.concatenate(obs_names)
all(obs_names==adata.obs_names)
# Add X_scanorama integration to adata
adata.obsm["X_scanorama"] = X_scanorama
# Dimensional reduction and clustering
sc.pp.neighbors(adata, n_neighbors=100, use_rep='X_scanorama')
sc.tl.leiden(adata, resolution=1)
sc.tl.louvain(adata, resolution=1)
sc.tl.umap(adata)
# Plot
sc.pl.umap(adata, color=['louvain', 'leiden', 'treatment', 'label_fine_haemosphere', 'sample_rep', 'cc_phase_class', 'doublet_score_log2', 'pHb_RNA', 'pRb_RNA'], wspace=0.5, ncols=3)
Found 14772 genes among all datasets Processing datasets (0, 1) Processing datasets (2, 3) Processing datasets (1, 3) Processing datasets (0, 2) Processing datasets (0, 3) Processing datasets (1, 2)
adata_sub = adata_set.copy()
hvg = list(set(hvg_0) | set(hvg_1) | set(hvg_2) | set(hvg_3))
adata_sub[0] = adata_sub[0][:,hvg].copy()
adata_sub[1] = adata_sub[1][:,hvg].copy()
adata_sub[2] = adata_sub[2][:,hvg].copy()
adata_sub[3] = adata_sub[3][:,hvg].copy()
# Run Scanorama
scanorama.integrate_scanpy(adata_sub, dimred=150, knn=20, verbose=True)
# Concatenate scanorama output
X_scanorama = [ad.obsm['X_scanorama'] for ad in adata_sub]
X_scanorama = np.concatenate(X_scanorama)
obs_names = [ad.obs_names for ad in adata_sub]
obs_names = np.concatenate(obs_names)
all(obs_names==adata.obs_names)
# Add X_scanorama integration to adata
adata.obsm["X_scanorama"] = X_scanorama
# Dimensional reduction and clustering
sc.pp.neighbors(adata, n_neighbors=100, use_rep='X_scanorama')
sc.tl.leiden(adata, resolution=1)
sc.tl.louvain(adata, resolution=1)
sc.tl.umap(adata)
# Plot
sc.pl.umap(adata, color=['louvain', 'leiden', 'treatment', 'label_fine_haemosphere', 'sample_rep', 'cc_phase_class', 'doublet_score_log2', 'pHb_RNA', 'pRb_RNA'], wspace=0.5, ncols=3)
Found 8486 genes among all datasets Processing datasets (2, 3) Processing datasets (1, 3) Processing datasets (0, 1) Processing datasets (0, 2) Processing datasets (0, 3) Processing datasets (1, 2)
adata_sub = adata_set.copy()
hvg = list(set(hvg_0) & set(hvg_1) & set(hvg_2) & set(hvg_3))
adata_sub[0] = adata_sub[0][:,hvg].copy()
adata_sub[1] = adata_sub[1][:,hvg].copy()
adata_sub[2] = adata_sub[2][:,hvg].copy()
adata_sub[3] = adata_sub[3][:,hvg].copy()
# Run Scanorama
scanorama.integrate_scanpy(adata_sub, dimred=150, knn=20, verbose=True)
# Concatenate scanorama output
X_scanorama = [ad.obsm['X_scanorama'] for ad in adata_sub]
X_scanorama = np.concatenate(X_scanorama)
obs_names = [ad.obs_names for ad in adata_sub]
obs_names = np.concatenate(obs_names)
all(obs_names==adata.obs_names)
# Add X_scanorama integration to adata
adata.obsm["X_scanorama"] = X_scanorama
# Dimensional reduction and clustering
sc.pp.neighbors(adata, n_neighbors=100, use_rep='X_scanorama')
sc.tl.leiden(adata, resolution=1)
sc.tl.louvain(adata, resolution=1)
sc.tl.umap(adata)
# Plot
sc.pl.umap(adata, color=['louvain', 'leiden', 'treatment', 'label_fine_haemosphere', 'sample_rep', 'cc_phase_class', 'doublet_score_log2', 'pHb_RNA', 'pRb_RNA'], wspace=0.5, ncols=3)
Found 2267 genes among all datasets Processing datasets (1, 3) Processing datasets (2, 3) Processing datasets (0, 2) Processing datasets (0, 1) Processing datasets (0, 3) Processing datasets (1, 2)
adata_sub = adata_set.copy()
hvg_nacl = list(set(hvg_0) & set(hvg_1))
hvg_cpg = list(set(hvg_2) & set(hvg_3))
hvg = list(set(hvg_nacl) | set(hvg_cpg))
adata_sub[0] = adata_sub[0][:,hvg].copy()
adata_sub[1] = adata_sub[1][:,hvg].copy()
adata_sub[2] = adata_sub[2][:,hvg].copy()
adata_sub[3] = adata_sub[3][:,hvg].copy()
# Run Scanorama
scanorama.integrate_scanpy(adata_sub, dimred=150, knn=20, verbose=True)
# Concatenate scanorama output
X_scanorama = [ad.obsm['X_scanorama'] for ad in adata_sub]
X_scanorama = np.concatenate(X_scanorama)
obs_names = [ad.obs_names for ad in adata_sub]
obs_names = np.concatenate(obs_names)
all(obs_names==adata.obs_names)
# Add X_scanorama integration to adata
adata.obsm["X_scanorama"] = X_scanorama
# Dimensional reduction and clustering
sc.pp.neighbors(adata, n_neighbors=100, use_rep='X_scanorama')
sc.tl.leiden(adata, resolution=1)
sc.tl.louvain(adata, resolution=1)
sc.tl.umap(adata)
# Plot
sc.pl.umap(adata, color=['louvain', 'leiden', 'treatment', 'label_fine_haemosphere', 'sample_rep', 'cc_phase_class', 'doublet_score_log2', 'pHb_RNA', 'pRb_RNA'], wspace=0.5, ncols=3)
Found 4517 genes among all datasets Processing datasets (2, 3) Processing datasets (1, 3) Processing datasets (0, 1) Processing datasets (0, 2) Processing datasets (0, 3) Processing datasets (1, 2)
# adata.write('data/object/int.h5ad')
# import sys
# sys.path.insert(0, '../scFacility/script')
# from dirFacility import adata2dir
# adata = adata.raw.to_adata()
# adata.layers['counts'] = adata.X
# adata2dir(adata, 'data/object/int/', assay="RNA", layers="counts", build_dir=True, overwrite=True)